#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(knitr)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)
# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Clusterings
clusterings = read_csv('./../Data/clusters.csv')
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
# Correct SFARI Scores
dataset$gene.score = genes_info$gene.score
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds, GO_neuronal)
Using the package clusterProfiler. Performing Gene Set Enrichment Analysis using the following datasets:
Gene Ontology
Disease Ontology
Disease Gene Network
KEGG
REACTOME
file_name = './../Data/GSEA_bonferroni.RData'
if(file.exists(file_name)){
load(file_name)
} else {
##############################################################################
# PREPARE DATASET
# Create dataset with top modules membership and removing the genes without an assigned module
EA_dataset = data.frame('ensembl_gene_id' = genes_info$ID, module = genes_info$Module) %>%
filter(genes_info$Module!='gray')
# Assign Entrez Gene Id to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=genes_info$ID[genes_info$Module!='gray'], mart=mart)
EA_dataset = biomart_output %>% dplyr::rename('ID' = ensembl_gene_id) %>%
left_join(dataset %>% dplyr::select(ID, contains('MM.')), by='ID')
##############################################################################
# PERFORM ENRICHMENT
# Following https://yulab-smu.github.io/clusterProfiler-book/chapter8.html
modules = dataset$Module[dataset$Module!='gray'] %>% as.character %>% table %>% names
nPerm = 1e5 # 100 times more than the default
enrichment_GO = list() # Gene Ontology
enrichment_DO = list() # Disease Ontology
enrichment_DGN = list() # Disease Gene Networks
enrichment_KEGG = list() # Kyoto Encyclopedia of Genes and Genomes
enrichment_Reactome = list() # Reactome: Pathway db
for(module in modules){
cat('\n')
cat(paste0('Module: ', which(modules == module), '/', length(modules)))
geneList = EA_dataset[,paste0('MM.',substring(module,2))]
names(geneList) = EA_dataset[,'entrezgene'] %>% as.character
geneList = sort(geneList, decreasing = TRUE)
enrichment_GO[[module]] = gseGO(geneList, OrgDb = org.Hs.eg.db, pAdjustMethod = 'bonferroni',
pvalueCutoff = 0.1, nPerm = nPerm, verbose = FALSE, seed = TRUE) %>%
data.frame
enrichment_DO[[module]] = gseDO(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% data.frame
enrichment_DGN[[module]] = gseDGN(geneList, pAdjustMethod = 'bonferroni', pvalueCutoff = 0.1,
nPerm = nPerm, verbose = FALSE, seed = TRUE) %>% data.frame
enrichment_KEGG[[module]] = gseKEGG(geneList, organism = 'human', pAdjustMethod = 'bonferroni',
pvalueCutoff = 0.1, nPerm = nPerm, verbose = FALSE, seed = TRUE) %>%
data.frame
enrichment_Reactome[[module]] = gsePathway(geneList, organism = 'human', pAdjustMethod = 'bonferroni',
pvalueCutoff = 0.1, nPerm = nPerm, verbose = F, seed = T) %>%
data.frame
# Temporal save, just in case SFARI Genes enrichment fails
save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome, file=file_name)
}
##############################################################################
# PERFROM ENRICHMENT FOR SFARI GENES
# BUILD MAPPING BETWEEN GENES AND SFARI
# Build TERM2GENE: dataframe of 2 columns with term and gene
term2gene = biomart_output %>%
left_join(genes_info %>% dplyr::select(ID, `gene-score`),
by = c('ensembl_gene_id'='ID')) %>% filter(`gene-score`!='Others') %>%
dplyr::select(-ensembl_gene_id) %>%
mutate('SFARI' = 'SFARI', `gene-score` = paste0('SFARI Score ',`gene-score`)) %>%
melt(id.vars = 'entrezgene') %>% dplyr::select(value, entrezgene) %>%
dplyr::rename('term' = value, 'gene' = entrezgene) %>% distinct
# PERFORM GSEA
enrichment_SFARI = list()
cat('\n\nORA OF SFARI GENES\n')
for(module in modules){
cat('\n')
cat(paste0('Module: ', which(modules == module), '/', length(modules)))
geneList = EA_dataset[,paste0('MM.',substring(module,2))]
names(geneList) = EA_dataset[,'entrezgene'] %>% as.character
geneList = sort(geneList, decreasing = TRUE)
enrichment_SFARI[[module]] = clusterProfiler::GSEA(geneList, pAdjustMethod = 'bonferroni', nPerm = nPerm,
TERM2GENE = term2gene, pvalueCutoff = 1, maxGSSize=1000,
verbose = FALSE, seed = TRUE) %>% data.frame
# Temporal save
save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome,
enrichment_SFARI, file=file_name)
}
##############################################################################
# Save enrichment results
save(enrichment_GO, enrichment_DO, enrichment_DGN, enrichment_KEGG, enrichment_Reactome,
enrichment_SFARI, file=file_name)
rm(getinfo, mart, biomart_output, gene, module, term2gene, geneList, EA_dataset)
}
SFARI_genes_by_module = c()
for(module in names(enrichment_SFARI)){
module_info = enrichment_SFARI[[module]] %>% mutate(Module = module) %>% dplyr::select(Module, ID, pvalue, p.adjust, NES) %>%
mutate(pvalue = ifelse(NES>0, pvalue, 1-pvalue), p.adjust = ifelse(NES>0, p.adjust, 1-p.adjust))
SFARI_genes_by_module = rbind(SFARI_genes_by_module, module_info)
}
SFARI_genes_by_module = SFARI_genes_by_module %>% left_join(dataset %>% dplyr::select(Module, MTcor) %>%
group_by(Module,MTcor) %>% tally %>% ungroup, by = 'Module')
plot_data = SFARI_genes_by_module %>% filter(ID == 'SFARI')
ggplotly(plot_data %>% ggplot(aes(MTcor, pvalue, size=n)) + geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) +
geom_smooth(color='#cccccc', size = 0.5, se=FALSE) + xlab('Module-Diagnosis Correlation') + ylab('Probability') +
ggtitle(paste0('
Corr = ', round(cor(plot_data$MTcor, plot_data$pvalue),2), ': Corr[Module-ASD corr<0] = ',
round(cor(plot_data$MTcor[plot_data$MTcor<0], plot_data$pvalue[plot_data$MTcor<0]),3),
' Corr[Module-ASD corr>0] = ',
round(cor(plot_data$MTcor[plot_data$MTcor>=0], plot_data$pvalue[plot_data$MTcor>=0]),2))) +
theme_minimal() + theme(legend.position = 'none') + scale_y_sqrt())
plot_data = SFARI_genes_by_module %>% filter (ID != 'SFARI') %>% mutate(color = SFARI_colour_hue(r = 1:3)[ID %>% substr(12,13) %>% as.numeric])
ggplotly(plot_data %>% ggplot(aes(MTcor, pvalue, size=n, color = color)) + geom_point(alpha=0.5, aes(id=Module)) + geom_smooth(size = 0.5, se=FALSE) +
xlab('Module-Diagnosis Correlation') + scale_colour_manual(values = SFARI_colour_hue(r=1:3)) + ylab('Probability') +
ggtitle('Enrichment by SFARI Gene Score') + theme_minimal() + theme(legend.position = 'none'))
plot_data = SFARI_genes_by_module %>% filter(ID == 'SFARI')
ggplotly(plot_data %>% ggplot(aes(MTcor, p.adjust, size=n)) + geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) +
geom_hline(yintercept = 0.05, color = 'gray', linetype = 'dotted') + xlab('Module-Diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
theme_minimal() + theme(legend.position = 'none'))
How can there be so many enriched modules?! it makes no sense unless SFARI Genes tend to have more extreme Module Memberships than the other genes, which would get them to the top of the sorted list easier than the rest
plot_data = dataset %>% dplyr::select(gene.score, contains('MM.')) %>% melt %>%
mutate(quant = cut(value, breaks = quantile(value, probs = seq(0,1,0.05)) %>% as.vector, labels = FALSE)) %>%
group_by(gene.score, quant) %>% tally %>% ungroup %>% ungroup
plot_data = plot_data %>% group_by(quant) %>% summarise(N = sum(n)) %>% ungroup %>% left_join(plot_data, by = 'quant') %>%
dplyr::select(quant, gene.score, n, N) %>% mutate(p = round(100*n/N,2)) %>% filter(!is.na(quant)) %>%
mutate(gene.score = factor(gene.score, levels = rev(c('1','2','3','Neuronal','Others'))))
ggplotly(plot_data %>% filter(!gene.score %in% c('Neuronal','Others')) %>% ggplot(aes(quant, p, fill = gene.score)) +
geom_bar(stat='identity') + xlab('Module Membership Quantiles') + ylab('% of SFARI Genes') + #geom_smooth(color = 'gray', se = FALSE) +
scale_fill_manual(values = SFARI_colour_hue(r=rev(c(1:3)))) + theme_minimal() + theme(legend.position = 'none'))
Is GSEA the right approach to measure functional/pathway enrichment in modules?
In general, the higher the Module Membership of a gene, the more likely it is to be assigned to a module, but there are exceptions
Ordering defined by Module Membership is a continuous scale, it doesn’t indicate where the module ends
modules = dataset$Module[dataset$Module!='gray'] %>% as.character %>% table %>% names
plot_data = dataset %>% dplyr::select(Module, contains('MM.')) %>% melt %>% mutate(in_module = substring(Module,2) == substring(variable,4)) %>%
mutate(alpha = ifelse(in_module, 0.8, 0.2))
## Using Module as id variables
p = plot_data %>% filter(Module == modules[1]) %>% ggplot(aes(Module, value, color = in_module)) +
geom_jitter(alpha = plot_data$alpha[plot_data$Module == modules[1]]) + theme_minimal() +
theme(legend.position = 'none') + xlab('') + ylab('Module Membership') + coord_flip()
ggExtra::ggMarginal(p, type = 'density', groupColour = TRUE, groupFill = TRUE, margins = 'x')